# R packages
library(ggplot2)
library(openxlsx)
library(reshape2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(RColorBrewer)
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
library(plyr)
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
# Bioconductor packages
library(limma)
library(edgeR)
library(variancePartition)
## Loading required package: BiocParallel
##
## Attaching package: 'variancePartition'
## The following object is masked from 'package:limma':
##
## topTable
library(GSVA)
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.18.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
##
## Attaching package: 'ComplexHeatmap'
## The following object is masked from 'package:plotly':
##
## add_heatmap
theme_set(theme_classic())
theme_update(axis.text=element_text(size=12),axis.title=element_text(size=14,face="bold"),strip.text=element_text(size=12),legend.text = element_text(size=12),legend.title = element_text(size=14),legend.key.size = unit(0.3,"cm"),title=element_text(size=14),line=element_line(size=0.5),rect=element_rect(size=0.5),legend.margin=margin(0,0,0,0),legend.box.margin=margin(0,0,0,0))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plot PCA
format_pca = function(pca,metadata,merge_name){
pca_coords = as.data.frame(pca$x)
pca_coords[[merge_name]] = rownames(pca_coords)
pca_coords = merge(pca_coords,metadata,all.x=TRUE)
lab_pc = list()
lab_pc = lapply(seq(1,length(pca$sdev)),function(x){
lab_pc[[x]] = paste("PC",x,": ",round(pca$sdev[x]^2/sum(pca$sdev^2)*100,1),"%",sep="")
})
pca_loadings = as.data.frame(pca$rotation)
pca_loadings$Element = rownames(pca_loadings)
pca_list = list(pca_coords,lab_pc,pca_loadings)
names(pca_list) = c("coords","labels","loadings")
return(pca_list)
}
From George et al, Supplementary Table 1 (https://www.nature.com/articles/nature14664)
metadata_george = read.xlsx("metadata/41586_2015_BFnature14664_MOESM72_ESM.xlsx",sheet="Suppl_Table1",startRow = 4,sep.names=" ")
metadata_george_wxs = metadata_george[which(metadata_george$`Whole genome sequencing` != "no"),]
metadata_george_wxs$`Sample-ID` = gsub(" ","",metadata_george_wxs$`Sample-ID`)
metadata_george = metadata_george[which(metadata_george$`RNA-seq` != "no"),]
metadata_george$`Sample-ID` = gsub(" ","",metadata_george$`Sample-ID`)
george_purity = read.xlsx("metadata/41586_2015_BFnature14664_MOESM72_ESM.xlsx",sheet="Suppl_Table2",startRow = 5,sep.names=" ")
george_purity = george_purity[which(george_purity$`sequencing method` %in% c("WGS","WGS (T), WES (N)")),]
Add placeholder for missing metadata
metadata_george[which(is.na(metadata_george$ethnicity)),]$ethnicity = "Unknown"
From:
# RNAseq sample links
metadata_lissa = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="Sample links",sep.names=" ")
colnames(metadata_lissa) = gsub("sample_id","NCI_sample_ID.RNA",gsub("Submitted to ","",colnames(metadata_lissa)))
# Supplementary Table 2
t2 = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="Supp Table 2",startRow = 3,sep.names=" ")[1:66,]
t2 = t2[,c("Sample id","Patient id","Biopsy site","Biopsy type")]
metadata_lissa = merge(metadata_lissa,t2,
by.x=c("dbGap sample ID","dbGap patient ID"),
by.y=c("Sample id","Patient id"),all=TRUE)
# Supplementary Table 3
t3 = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="Supp Table 3",startRow = 3,sep.names=" ")[1:100,]
metadata_lissa = merge(metadata_lissa,t3,
by.x=c("dbGap sample ID","dbGap patient ID"),
by.y=c("Sample id","Patient id"),all=TRUE)
# Additional clinical metadata
clinical = read.xlsx("metadata/Clinical characteritics.xlsx",startRow = 3)
clinical = clinical[,c("sample_id","Cohort","Race(s)")]
metadata_lissa = merge(metadata_lissa,clinical,
by.x="NCI_sample_ID.RNA",
by.y="sample_id",all.x=TRUE)
rm(list=c("t2","t3","clinical"))
RNA (100 samples)
# dbGaP
sra = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="SraRunTable",sep.names=" ")
sra = sra[which(sra$`Assay Type` == "RNA-Seq"),]
sra = sra[,c("Sample Name","submitted_subject_id","body_site","sex")]
sra$`Sample Name` = gsub("_RNA","",sra$`Sample Name`)
metadata_lissa = merge(metadata_lissa, sra,
by.x=c("dbGap sample ID","dbGap patient ID"),
by.y=c("Sample Name","submitted_subject_id"))
# Clinomics portal
clinomics = read.xlsx("metadata/SCLC Lissa Clinomics portal.xlsx",sep.names=" ")
clinomics = clinomics[,c("Sample Name","Patient ID","Library Type","Diagnosis","FFPE or Fresh Frozen","RIN","DV200")]
metadata_lissa = merge(metadata_lissa,clinomics,
by.x=c("NCI_sample_ID.RNA","patient_id"),
by.y=c("Sample Name","Patient ID"),all=TRUE)
rm(list=c("sra","clinomics"))
WXS (34 tumor/normal pairs)
# Nobu WXS links
wxs_links = read.xlsx("metadata/Lissa_et_al_Nat_Commun_WES_sample_matching_NT20240111.xlsx")
# Clinomics portal
clinomics = read.xlsx("metadata/SCLC Lissa Clinomics portal WXS.xlsx",sep.names=" ")
clinomics = clinomics[,c("Sample Name","Patient ID","Library Type","Diagnosis","FFPE or Fresh Frozen",
"Tumor|Normal","Normal sample","Rnaseq sample")]
clinomics = merge(clinomics,wxs_links[,c("dbGaP_sample_ID","NCI_sample_ID")],
by.x="Sample Name",
by.y="NCI_sample_ID",all.y=TRUE)
# Merge tumor/normal pairs
clinomics = cbind(clinomics,colsplit(clinomics$dbGaP_sample_ID,"_",c("dbGaP sample ID","Material")))
tumor = clinomics[which(clinomics$Material == "DNA"),]
normal = clinomics[which(clinomics$Material == "PBMC"),]
clinomics = merge(tumor[,c("dbGaP sample ID","Sample Name")],
normal[,c("dbGaP sample ID","Sample Name","Patient ID","Library Type","Diagnosis")],
by=c("dbGaP sample ID"),all=TRUE)
colnames(clinomics) = mapvalues(colnames(clinomics),
c("Sample Name.x","Sample Name.y","Library Type","Diagnosis"),
c("NCI_sample_ID.tumor","NCI_sample_ID.normal","Library Type.WXS","Diagnosis.WXS"))
# Combine
metadata_lissa = merge(metadata_lissa, clinomics,
by.x=c("dbGap sample ID","patient_id"),
by.y=c("dbGaP sample ID","Patient ID"),all=TRUE)
rm(list=c("wxs_links","clinomics","tumor","normal"))
Add placeholders for missing metadata; storage method: “FFPE” and library method: “RNA access” where unspecified
metadata_lissa[which(is.na(metadata_lissa$sex)),]$sex = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`Race(s)`)),]$`Race(s)` = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`Biopsy timepoint`)),]$`Biopsy timepoint` = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`Biopsy type`)),]$`Biopsy type` = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`FFPE or Fresh Frozen`)),]$`FFPE or Fresh Frozen` = "FFPE"
metadata_lissa[which(is.na(metadata_lissa$`Library Type`)),]$`Library Type` = "access"
Raw counts and TPM generated using CCBR Pipeliner (RENEE) from:
Load raw counts; remove “T” suffix to match metadata
george.raw = read.table("rnaseq/George/analysis/DEG_ALL/RSEM.genes.expected_count.all_samples.txt",sep='\t',header=TRUE)
rownames(george.raw) = george.raw$gene_id
george.raw = george.raw[,3:59]
colnames(george.raw) = gsub("T$","",colnames(george.raw))
Filter metadata to samples with fastq; specify library method: “polyA” for remaining samples (all from George et al original cohort)
metadata_george_filter = metadata_george[which(metadata_george$`Sample-ID` %in% colnames(george.raw)),]
metadata_george_filter$`Library Type` = "PolyA"
Load TPM, convert to log2
george.tpm = read.table("rnaseq/George/analysis/DEG_ALL/RSEM.genes.TPM.all_samples.txt",sep='\t',header=TRUE)
rownames(george.tpm) = george.tpm$gene_id
george.tpm = george.tpm[,3:59]
colnames(george.tpm) = gsub("T$","",colnames(george.tpm))
george.tpm = log2(george.tpm + 1)
Load raw counts
lissa.raw = read.table("rnaseq/Lissa/analysis100/DEG_ALL/RSEM.genes.expected_count.all_samples.txt",sep='\t',header=TRUE)
genes = lissa.raw[,1:2]
rownames(lissa.raw) = lissa.raw$gene_id
lissa.raw = lissa.raw[,metadata_lissa$NCI_sample_ID.RNA]
Load TPM, convert to log2
lissa.tpm = read.table("rnaseq/Lissa/analysis100/DEG_ALL/RSEM.genes.TPM.all_samples.txt",sep='\t',header=TRUE)
rownames(lissa.tpm) = lissa.tpm$gene_id
lissa.tpm = lissa.tpm[,metadata_lissa$NCI_sample_ID.RNA]
lissa.tpm = log2(lissa.tpm + 1)
combine.raw = cbind(george.raw,lissa.raw)
metadata_combined = data.frame(Sample = c(metadata_george_filter$`Sample-ID`,metadata_lissa$NCI_sample_ID.RNA),
Cohort = c(rep("George",57),rep("Lissa",100)),
Timepoint = c(metadata_george_filter$`previous therapeutic treatment for SCLC`,metadata_lissa$`Biopsy timepoint`),
Storage = c(metadata_george_filter$Tumor,metadata_lissa$`FFPE or Fresh Frozen`),
Library = c(metadata_george_filter$`Library Type`,metadata_lissa$`Library Type`),
Ethnicity = c(metadata_george_filter$ethnicity,metadata_lissa$`Race(s)`))
From raw counts, filter lowly expressed genes, perform TMM normalization and voom
# Create DGEList
george.dge = DGEList(counts=george.raw)
# Filter lowly expressed genes
design <- model.matrix( ~ `primary tumor/metastasis` + `previous therapeutic treatment for SCLC`, metadata_george_filter)
george.dge <- george.dge[filterByExpr(george.dge,design),, keep.lib.sizes=FALSE]
# TMM normalization
george.dge = calcNormFactors(george.dge,method="TMM")
# Voom normalization
george.voom <- voom(george.dge,design)
george.voom = george.voom$E
# Create DGEList
lissa.dge = DGEList(counts=lissa.raw)
# Filter lowly expressed genes
design <- model.matrix( ~ Disease + `Biopsy timepoint`, metadata_lissa)
lissa.dge <- lissa.dge[filterByExpr(lissa.dge,design),, keep.lib.sizes=FALSE]
# TMM normalization
lissa.dge = calcNormFactors(lissa.dge,method="TMM")
# Voom normalization
lissa.voom <- voom(lissa.dge,design)
lissa.voom = lissa.voom$E
# Create DGEList
combine.dge = DGEList(counts=combine.raw)
# Filter lowly expressed genes
design <- model.matrix( ~ Cohort, metadata_combined)
combine.dge <- combine.dge[filterByExpr(combine.dge,design),, keep.lib.sizes=FALSE]
# TMM normalization
combine.dge = calcNormFactors(combine.dge,method="TMM")
# Voom normalization
combine.voom <- voom(combine.dge,design)
combine.voom = combine.voom$E
rm(list=c("george.raw","lissa.raw"))
Identify major sources of variation in the combined and separate cohorts to determine whether batch correction should be performed.
On 3k most variable genes
combine.voom.3k = combine.voom[order(apply(combine.voom,1,mad),decreasing = TRUE),][1:3000,]
combine.voom.pca = prcomp(t(combine.voom.3k),scale=TRUE,center=TRUE)
combine.voom.pca = format_pca(combine.voom.pca,metadata_combined,"Sample")
# 2D
ggplot(combine.voom.pca$coords,aes(x=PC1,y=PC2,color=Cohort)) + geom_point(size=2) + coord_fixed() + xlab(combine.voom.pca$labels[[1]]) + ylab(combine.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("George/Lissa, by cohort")
# 2D
ggplot(combine.voom.pca$coords,aes(x=PC1,y=PC2,color=Library,shape=Cohort)) + geom_point(size=2) + coord_fixed() + xlab(combine.voom.pca$labels[[1]]) + ylab(combine.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("George/Lissa, by library method / cohort")
# 3D
fig <- plot_ly(combine.voom.pca$coords, x = ~`PC1`, y = ~`PC2`, z = ~`PC3`,color = ~Library)
fig <- fig %>% add_markers()
fig <- fig %>% layout(title = "George/Lissa, by library method",
scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
fig
Conclusion: The George et al and Lissa et al cohorts separate along PC1, driven primarily by library method. Combining the two cohorts will be difficult, as several technical variables are co-linear with the cohort (e.g., untreated primary tumor vs relapse/metastasis).
Summary of variables and variable correlation
apply(metadata_george_filter[,c("sex", "age", "ethnicity", "Pathology review 2", "primary tumor/metastasis","previous therapeutic treatment for SCLC", "stage_UICC", "tissue sampling","Tumor")],2,function(x) table(x,useNA="ifany"))
## $sex
## x
## female male
## 15 42
##
## $age
## x
## 48 51 52 53 54 57 58 59 61 62 63 64 65 66 67 68 70 71 73 74 75 76 77 81 82 83
## 1 1 1 2 1 2 2 2 5 1 7 5 3 1 2 5 4 1 1 2 1 2 1 1 1 2
##
## $ethnicity
## x
## Asian Caucasian Unknown
## 1 27 29
##
## $`Pathology review 2`
## x
## SCLC
## 57
##
## $`primary tumor/metastasis`
## x
## metastasis (pleura) primary
## 1 56
##
## $`previous therapeutic treatment for SCLC`
## x
## relapse untreated
## 1 56
##
## $stage_UICC
## x
## I Ia Ib IB IIa IIb IIIa IIIb IV
## 1 15 8 1 5 3 14 3 7
##
## $`tissue sampling`
## x
## biopsy pleural effusion surgical resection
## 1 1 55
##
## $Tumor
## x
## FF tissue
## 57
table(metadata_george_filter$sex,metadata_george_filter$ethnicity)
##
## Asian Caucasian Unknown
## female 1 3 11
## male 0 24 18
Determine % of variance attributable to each variable by gene (for 3k most variable genes)
george.voom.3k = george.voom[order(apply(george.voom,1,mad),decreasing = TRUE),][1:3000,]
form <- ~ `primary tumor/metastasis` + `previous therapeutic treatment for SCLC`
varPart <- fitExtractVarPartModel(george.voom.3k,form,metadata_george_filter)
## Warning in filterInputData(exprObj, formula, data, useWeights = useWeights): Sample names of responses (i.e. columns of exprObj) do not match
## sample names of metadata (i.e. rows of data). Recommend consistent
## names so downstream results are labeled consistently.
vp = melt(varPart,variable.name="Variable",value.name="Variance")
## No id variables; using all as measure variables
vp$Variance = 100*vp$Variance
ggplot(vp,aes(x=reorder(Variable,Variance,median),y=Variance)) + geom_boxplot(outlier.size=0.5,size=0.5) + ylab("Variance (%)") + xlab("Source of variation") + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=1)) + ggtitle("George")
george.voom.pca = prcomp(t(george.voom.3k),scale=TRUE,center=TRUE)
george.voom.pca = format_pca(george.voom.pca,metadata_george_filter,"Sample-ID")
# 2D
ggplot(george.voom.pca$coords,aes(x=PC1,y=PC2,color=`primary tumor/metastasis`)) + geom_point(size=2) + coord_fixed() + xlab(george.voom.pca$labels[[1]]) + ylab(george.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("George, by primary tumor vs. metastasis")
# 3D
fig <- plot_ly(george.voom.pca$coords, x = ~`PC1`, y = ~`PC2`, z = ~`PC3`,color = ~stage_UICC)
fig <- fig %>% add_markers()
fig <- fig %>% layout(title = "George, by stage UICC",
scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Conclusion: The George et al cohort is relatively standardized, and little variation is attributable to the few metastasis / relapse samples.
Summary of variables and variable correlation
apply(metadata_lissa[,c("patient_id", "Cohort", "Race(s)", "sex", "Disease", "Age at diagnosis", "SCLC staging/initial diagnosis","body_site", "Biopsy type", "Biopsy timepoint","Library Type", "FFPE or Fresh Frozen")],2,function(x) table(x,useNA="ifany"))
## $patient_id
## x
## 4525930 7857901 7877742 7925529 7926637 7944275 7953756 7954499 7954517 7959576
## 1 2 1 1 2 2 1 1 1 2
## 7973317 8022288 CL0083 CL0106 CL0107 CL0108 CL0109 CL0110 CL0111 CL0114
## 1 1 1 2 1 2 1 1 1 1
## CL0116 CL0124 CL0125 CL0126 CL0146 CL0147 CL0148 CL0152 CL0157 CL0164
## 3 2 2 2 1 3 1 1 1 1
## CL0169 CL0170 CL0172 CL0173 CL0174 CL0176 CL0180 CL0186 CL0191 CL0193
## 1 1 1 1 1 1 2 2 1 1
## CL0196 CL0214 CL0230 CL0261 CL0267 NCI0402 NCI0422 ucmc_11 ucmc_12 ucmc_13
## 1 1 2 1 1 2 3 1 2 1
## ucmc_14 ucmc_15 ucmc_16 ucmc_17 ucmc_18 ucmc_19 ucmc_2 ucmc_21 ucmc_23 ucmc_24
## 1 3 2 1 2 1 1 1 2 1
## ucmc_25 ucmc_26 ucmc_3 ucmc_30 ucmc_31 ucmc_33 ucmc_35 ucmc_5 ucmc_6 ucmc_7
## 2 1 1 1 2 1 1 2 1 1
## ucmc_8 ucmc_9
## 1 1
##
## $Cohort
## x
## NCI Rochester
## 66 34
##
## $`Race(s)`
## x
## Asian Black or African American Unknown
## 1 9 2
## White
## 88
##
## $sex
## x
## female male Unknown
## 54 44 2
##
## $Disease
## x
## EP_bladder EP_cervix EP_EGFRmt_trSCLC EP_ovary
## 1 3 4 1
## EP_prostate EP_rectum EP_subglottis SCLC
## 1 1 1 88
##
## $`Age at diagnosis`
## x
## 29 37 39 41 50 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 2 2 1 2 2 1 7 1 3 5 3 8 4 4 5 6 3 4 3 1 6 4 1 7 1
## 74 75 77 84 86
## 1 8 1 1 1
##
## $`SCLC staging/initial diagnosis`
## x
## Extensive stage Limited stage
## 74 26
##
## $body_site
## x
## Adrenal gland Axillary lymph node
## 5 1
## Bladder Bone
## 1 2
## Brain Inguinal lymph node
## 2 2
## Liver Lung
## 29 22
## Mediastinal lymph node Pelvic mass
## 17 2
## Pleural fluid Prostate
## 4 1
## Subglottis Supraclavicular lymph node
## 1 10
## Uterine
## 1
##
## $`Biopsy type`
## x
## Core biopsy Excision FNAC Unknown
## 51 11 4 34
##
## $`Biopsy timepoint`
## x
## Initial diagnosis Relapse Unknown
## 15 84 1
##
## $`Library Type`
## x
## access polya_stranded SmartRNA
## 95 3 2
##
## $`FFPE or Fresh Frozen`
## x
## FFPE Fresh Frozen frozen
## 89 1 10
canCorPairs( ~ Cohort + `Race(s)` + sex + Disease + `SCLC staging/initial diagnosis` + body_site + `Biopsy type` + `Biopsy timepoint` + `Library Type` + `FFPE or Fresh Frozen`, metadata_lissa)
## Warning in canCorPairs(~Cohort + `Race(s)` + sex + Disease + `SCLC staging/initial diagnosis` + : Regression model may be problematic.
## High colinearity between variables:
## Cohort and `Biopsy type`
## Cohort `Race(s)` sex Disease
## Cohort 1.0000000 0.22257979 0.21098641 0.26504327
## `Race(s)` 0.2225798 1.00000000 0.72677313 0.33446491
## sex 0.2109864 0.72677313 1.00000000 0.24274955
## Disease 0.2650433 0.33446491 0.24274955 1.00000000
## `SCLC staging/initial diagnosis` 0.3291866 0.16404733 0.09023723 0.36652389
## body_site 0.6260669 0.40709374 0.43934994 0.80632823
## `Biopsy type` 1.0000000 0.28842033 0.15818943 0.43279239
## `Biopsy timepoint` 0.3273268 0.07194976 0.13117847 0.25981803
## `Library Type` 0.1646610 0.41252863 0.10007676 0.24154358
## `FFPE or Fresh Frozen` 0.2523300 0.31285404 0.10189629 0.09179851
## `SCLC staging/initial diagnosis` body_site
## Cohort 0.32918659 0.6260669
## `Race(s)` 0.16404733 0.4070937
## sex 0.09023723 0.4393499
## Disease 0.36652389 0.8063282
## `SCLC staging/initial diagnosis` 1.00000000 0.5723118
## body_site 0.57231178 1.0000000
## `Biopsy type` 0.37660669 0.5880126
## `Biopsy timepoint` 0.20445731 0.3996479
## `Library Type` 0.18263411 0.4005026
## `FFPE or Fresh Frozen` 0.17335539 0.3451468
## `Biopsy type` `Biopsy timepoint`
## Cohort 1.0000000 0.32732684
## `Race(s)` 0.2884203 0.07194976
## sex 0.1581894 0.13117847
## Disease 0.4327924 0.25981803
## `SCLC staging/initial diagnosis` 0.3766067 0.20445731
## body_site 0.5880126 0.39964789
## `Biopsy type` 1.0000000 0.32027102
## `Biopsy timepoint` 0.3202710 1.00000000
## `Library Type` 0.1799431 0.11197140
## `FFPE or Fresh Frozen` 0.2085420 0.05258182
## `Library Type` `FFPE or Fresh Frozen`
## Cohort 0.1646610 0.25232997
## `Race(s)` 0.4125286 0.31285404
## sex 0.1000768 0.10189629
## Disease 0.2415436 0.09179851
## `SCLC staging/initial diagnosis` 0.1826341 0.17335539
## body_site 0.4005026 0.34514679
## `Biopsy type` 0.1799431 0.20854198
## `Biopsy timepoint` 0.1119714 0.05258182
## `Library Type` 1.0000000 0.47524569
## `FFPE or Fresh Frozen` 0.4752457 1.00000000
table(metadata_lissa$Cohort,metadata_lissa$`Biopsy type`)
##
## Core biopsy Excision FNAC Unknown
## NCI 51 11 4 0
## Rochester 0 0 0 34
Determine % of variance attributable to each variable by gene (for 3k most variable genes)
lissa.voom.3k = lissa.voom[order(apply(lissa.voom,1,mad),decreasing = TRUE),][1:3000,]
form <- ~ Cohort + Disease + `Biopsy timepoint` + `Library Type` + `FFPE or Fresh Frozen`
varPart <- fitExtractVarPartModel(lissa.voom.3k,form,metadata_lissa)
## Warning in filterInputData(exprObj, formula, data, useWeights = useWeights): Sample names of responses (i.e. columns of exprObj) do not match
## sample names of metadata (i.e. rows of data). Recommend consistent
## names so downstream results are labeled consistently.
vp = melt(varPart,variable.name="Variable",value.name="Variance")
## No id variables; using all as measure variables
vp$Variance = 100*vp$Variance
ggplot(vp,aes(x=reorder(Variable,Variance,median),y=Variance)) + geom_boxplot(outlier.size=0.5,size=0.5) + ylab("Variance (%)") + xlab("Source of variation") + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=1)) + ggtitle("Lissa")
form <- ~ patient_id + `FFPE or Fresh Frozen`
varPart <- fitExtractVarPartModel(lissa.voom.3k,form,metadata_lissa)
## Warning in filterInputData(exprObj, formula, data, useWeights = useWeights): Sample names of responses (i.e. columns of exprObj) do not match
## sample names of metadata (i.e. rows of data). Recommend consistent
## names so downstream results are labeled consistently.
vp = melt(varPart,variable.name="Variable",value.name="Variance")
## No id variables; using all as measure variables
vp$Variance = 100*vp$Variance
ggplot(vp,aes(x=reorder(Variable,Variance,median),y=Variance)) + geom_boxplot(outlier.size=0.5,size=0.5) + ylab("Variance (%)") + xlab("Source of variation") + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=1)) + ggtitle("Lissa")
lissa.voom.pca = prcomp(t(lissa.voom.3k),scale=TRUE,center=TRUE)
lissa.voom.pca = format_pca(lissa.voom.pca,metadata_lissa,"NCI_sample_ID.RNA")
# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=`Biopsy timepoint`)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by timepoint")
# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=Cohort)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by cohort")
# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=`Library Type`)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by library method")
# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=`FFPE or Fresh Frozen`)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by storage method")
# 3D
fig <- plot_ly(lissa.voom.pca$coords, x = ~`PC1`, y = ~`PC2`, z = ~`PC3`,color = ~body_site)
fig <- fig %>% add_markers()
fig <- fig %>% layout(title = "Lissa, by body site",
scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Conclusion: The Lissa et al cohort exhibits greater technical and biological variability. Technical variables such as site, storage, or library method have a similar contribution to variance as disease. However, variation due to storage method is small compared to that by patient. The sampled body site separates samples along PC1 (primarily liver vs. lung/lymph node).
Extract CENPA expression (log2TPM) and split samples into CENPA-high/low based on median expression
metadata_george_filter$CENPA.tpm = unlist(george.tpm["ENSG00000115163.15",metadata_george_filter$`Sample-ID`])
summary(metadata_george_filter$CENPA.tpm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7137 4.6159 5.0921 4.9319 5.4754 6.5283
metadata_george_filter$CENPA.category = ifelse(metadata_george_filter$CENPA.tpm > median(metadata_george_filter$CENPA.tpm),"High","Low")
metadata_lissa$CENPA.tpm = unlist(lissa.tpm["ENSG00000115163.15",metadata_lissa$NCI_sample_ID.RNA])
summary(metadata_lissa$CENPA.tpm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.008 2.974 2.780 3.570 4.980
metadata_lissa$CENPA.category = ifelse(metadata_lissa$CENPA.tpm > median(metadata_lissa$CENPA.tpm),"High","Low")
ces = c("CENPA","HJURP","MIS18BP1","MIS18A","OIP5","CENPC","CENPN","CENPI","CENPH","CENPT","CENPW","CENPS","CENPX","CENPM","CENPU","CENPL","CENPK","CENPO","CENPP","CENPQ","ITGB3BP","KNL1","ZWINT","MIS12","PMF1","NSL1","DSN1","NDC80","SPC24","SPC25","NUF2")
Calculate CES score as the sum of the log2(TPM) per gene and split samples into CES-high/low based on median expression
metadata_george_filter$CES.tpm = colSums(george.tpm[genes[match(ces,genes$GeneName),]$gene_id,metadata_george_filter$`Sample-ID`])
metadata_george_filter$CES.category = ifelse(metadata_george_filter$CES.tpm > median(metadata_george_filter$CES.tpm),"High","Low")
metadata_lissa$CES.tpm = colSums(lissa.tpm[genes[match(ces,genes$GeneName),]$gene_id,metadata_lissa$NCI_sample_ID.RNA])
metadata_lissa$CES.category = ifelse(metadata_lissa$CES.tpm > median(metadata_lissa$CES.tpm),"High","Low")
# Score vs score
ggplot(metadata_george_filter,aes(x=CENPA.tpm,y=CES.tpm)) + geom_point(aes(color=CENPA.category,shape=CES.category)) +
geom_vline(xintercept=median(metadata_george_filter$CENPA.tpm),linetype="dashed") +
geom_hline(yintercept=median(metadata_george_filter$CES.tpm),linetype="dashed") +
ggtitle("George, CENPA vs. CES") + xlab("CENPA expression (log2TPM)") + ylab("CES (log2TPM)") +
scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="CES category") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_george_filter$CENPA.tpm,metadata_george_filter$CES.tpm,method="spearman")
##
## Spearman's rank correlation rho
##
## data: metadata_george_filter$CENPA.tpm and metadata_george_filter$CES.tpm
## S = 7908, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7437127
# Category vs category
table(metadata_george_filter$CENPA.category,metadata_george_filter$CES.category)
##
## High Low
## High 24 4
## Low 4 25
chisq.test(metadata_george_filter$CENPA.category,metadata_george_filter$CES.category)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: metadata_george_filter$CENPA.category and metadata_george_filter$CES.category
## X-squared = 26.677, df = 1, p-value = 2.405e-07
# Score vs score
ggplot(metadata_lissa,aes(x=CENPA.tpm,y=CES.tpm)) + geom_point(aes(color=CENPA.category,shape=CES.category)) +
geom_vline(xintercept=median(metadata_lissa$CENPA.tpm),linetype="dashed") +
geom_hline(yintercept=median(metadata_lissa$CES.tpm),linetype="dashed") +
ggtitle("Lissa, CENPA vs. CES") + xlab("CENPA expression (log2TPM)") + ylab("CES (log2TPM)") +
scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="CES category") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_lissa$CENPA.tpm,metadata_lissa$CES.tpm,method="spearman")
## Warning in cor.test.default(metadata_lissa$CENPA.tpm, metadata_lissa$CES.tpm, :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: metadata_lissa$CENPA.tpm and metadata_lissa$CES.tpm
## S = 23002, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8619768
# Category vs category
table(metadata_lissa$CENPA.category,metadata_lissa$CES.category)
##
## High Low
## High 40 10
## Low 10 40
chisq.test(metadata_lissa$CENPA.category,metadata_lissa$CES.category)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: metadata_lissa$CENPA.category and metadata_lissa$CES.category
## X-squared = 33.64, df = 1, p-value = 6.631e-09
Conclusion: There is a strong correlation between CENP-A expression and the CES score in both cohorts.
Compare CES score to technical and clinical metadata for each cohort.
apply(metadata_george_filter[,c("sex","primary tumor/metastasis","previous therapeutic treatment for SCLC","chemotherapy NEO-ADJUVANT (yes/no)","chemotherapy (yes/no)")],2,function(x) wilcox.test(metadata_george_filter$CES.tpm~x))
## $sex
##
## Wilcoxon rank sum exact test
##
## data: metadata_george_filter$CES.tpm by x
## W = 345, p-value = 0.5966
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`primary tumor/metastasis`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_george_filter$CES.tpm by x
## W = 17, p-value = 0.5233
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`previous therapeutic treatment for SCLC`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_george_filter$CES.tpm by x
## W = 37, p-value = 0.6054
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`chemotherapy NEO-ADJUVANT (yes/no)`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_george_filter$CES.tpm by x
## W = 41, p-value = 0.1863
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`chemotherapy (yes/no)`
##
## Wilcoxon rank sum exact test
##
## data: metadata_george_filter$CES.tpm by x
## W = 198, p-value = 0.7118
## alternative hypothesis: true location shift is not equal to 0
apply(metadata_george_filter[,c("ethnicity","stage_UICC","smoking_status","tissue sampling","radiation (yes/no)")],2,function(x) kruskal.test(metadata_george_filter$CES.tpm,x))
## $ethnicity
##
## Kruskal-Wallis rank sum test
##
## data: metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 0.86092, df = 2, p-value = 0.6502
##
##
## $stage_UICC
##
## Kruskal-Wallis rank sum test
##
## data: metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 9.2312, df = 8, p-value = 0.3232
##
##
## $smoking_status
##
## Kruskal-Wallis rank sum test
##
## data: metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 2.856, df = 5, p-value = 0.7222
##
##
## $`tissue sampling`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 0.73348, df = 2, p-value = 0.693
##
##
## $`radiation (yes/no)`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.5903, df = 2, p-value = 0.1661
cor.test(metadata_george_filter$CES.tpm,as.numeric(metadata_george_filter$age))
##
## Pearson's product-moment correlation
##
## data: metadata_george_filter$CES.tpm and as.numeric(metadata_george_filter$age)
## t = -0.87346, df = 55, p-value = 0.3862
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3663703 0.1481129
## sample estimates:
## cor
## -0.116969
Conclusion: No metadata is correlated with CES score in the George et al primary tumor cohort.
apply(metadata_lissa[,c("Cohort","Trial_ATRi","Trial_IC+ PARPi","Trial_IC","SCLC staging/initial diagnosis","Smoking history","Platinum sensitivity","IO prior to biopsy")],2,function(x) wilcox.test(metadata_lissa$CES.tpm~x))
## $Cohort
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 1163, p-value = 0.7682
## alternative hypothesis: true location shift is not equal to 0
##
##
## $Trial_ATRi
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 1056, p-value = 0.4928
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`Trial_IC+ PARPi`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 986, p-value = 0.6329
## alternative hypothesis: true location shift is not equal to 0
##
##
## $Trial_IC
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 1163, p-value = 0.7682
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`SCLC staging/initial diagnosis`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 766, p-value = 0.1245
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`Smoking history`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 229, p-value = 0.01477
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`Platinum sensitivity`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 1287, p-value = 0.3877
## alternative hypothesis: true location shift is not equal to 0
##
##
## $`IO prior to biopsy`
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by x
## W = 796, p-value = 0.4089
## alternative hypothesis: true location shift is not equal to 0
apply(metadata_lissa[,c("body_site","sex","Biopsy site","Biopsy type","Disease","Biopsy timepoint","Library Type","Diagnosis","FFPE or Fresh Frozen","Race(s)")],2,function(x) kruskal.test(metadata_lissa$CES.tpm,x))
## $body_site
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 16.451, df = 14, p-value = 0.2866
##
##
## $sex
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.9572, df = 2, p-value = 0.1383
##
##
## $`Biopsy site`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 34.569, df = 30, p-value = 0.2587
##
##
## $`Biopsy type`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 1.3415, df = 3, p-value = 0.7193
##
##
## $Disease
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 8.557, df = 7, p-value = 0.286
##
##
## $`Biopsy timepoint`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.5726, df = 2, p-value = 0.1676
##
##
## $`Library Type`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 7.3679, df = 2, p-value = 0.02512
##
##
## $Diagnosis
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 6.4716, df = 6, p-value = 0.3725
##
##
## $`FFPE or Fresh Frozen`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 2.188, df = 2, p-value = 0.3349
##
##
## $`Race(s)`
##
## Kruskal-Wallis rank sum test
##
## data: metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.3901, df = 3, p-value = 0.3353
apply(metadata_lissa[,c("Age at diagnosis","Number of systemic therapy","Age at biopsy timepoint")],2,function(x) cor.test(metadata_lissa$CES.tpm,as.numeric(x)))
## $`Age at diagnosis`
##
## Pearson's product-moment correlation
##
## data: metadata_lissa$CES.tpm and as.numeric(x)
## t = 0.17236, df = 98, p-value = 0.8635
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1796240 0.2130977
## sample estimates:
## cor
## 0.01740826
##
##
## $`Number of systemic therapy`
##
## Pearson's product-moment correlation
##
## data: metadata_lissa$CES.tpm and as.numeric(x)
## t = -0.91157, df = 98, p-value = 0.3642
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2830154 0.1066443
## sample estimates:
## cor
## -0.09169449
##
##
## $`Age at biopsy timepoint`
##
## Pearson's product-moment correlation
##
## data: metadata_lissa$CES.tpm and as.numeric(x)
## t = 0.088373, df = 97, p-value = 0.9298
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1887736 0.2060195
## sample estimates:
## cor
## 0.008972589
ggplot(metadata_lissa,aes(x=`Smoking history`,y=CES.tpm)) + geom_boxplot() +
ylab("CES expression (log2TPM)")
ggplot(metadata_lissa,aes(x=`Library Type`,y=CES.tpm)) + geom_boxplot() +
ylab("CES expression (log2TPM)")
Conclusion: There is an association between CES score and smoking history and library method in the Lissa et al relapse cohort.
Calculate 50-gene signature indicating neuroendocrine differentiation (-1 to 1, non-NE vs. NE differentiation) and divide into NE (>0) and non-NE (<0)
ne50_UP = c("BEX1", "ASCL1", "INSM1", "CHGA","TAGLN3", "KIF5C", "CRMP1", "SCG3","SYT4", "RTN1", "MYT1", "SYP","KIF1A", "TMSB15A", "SYN1","SYT11", "RUNDC3A", "TFF3","CHGB", "FAM57B", "SH3GL2", "BSN","SEZ6", "TMSB15B", "CELF3")
ne50_DN = c("RAB27B", "TGFBR2", "SLC16A5","S100A10", "ITGB4", "YAP1", "LGALS3","EPHA2", "S100A16", "PLAU", "ABCC3","ARHGDIB", "CYR61", "PTGES", "CCND1","IFITM2", "IFITM3", "AHNAK", "CAV2","TACSTD2", "TGFBI", "EMP1", "CAV1","ANXA1", "MYOF")
ne50 = list(ne50_UP = genes[match(ne50_UP,genes$GeneName),]$gene_id,
ne50_DN = genes[match(ne50_DN,genes$GeneName),]$gene_id)
george.ne50 <- gsva(as.matrix(george.tpm), ne50, method="ssgsea")
## Warning: Calling gsva(expr=., gset.idx.list=., method=., ...) is deprecated;
## use a method-specific parameter object (see '?gsva').
## Warning in .filterFeatures(expr, method): 10827 genes with constant expression
## values throughout the samples.
## Estimating ssGSEA scores for 2 gene sets.
## [1] "Calculating ranks..."
## [1] "Calculating absolute values from ranks..."
##
|
| | 0%
|
|= | 2%
|
|== | 4%
|
|==== | 5%
|
|===== | 7%
|
|====== | 9%
|
|======= | 11%
|
|========= | 12%
|
|========== | 14%
|
|=========== | 16%
|
|============ | 18%
|
|============== | 19%
|
|=============== | 21%
|
|================ | 23%
|
|================= | 25%
|
|================== | 26%
|
|==================== | 28%
|
|===================== | 30%
|
|====================== | 32%
|
|======================= | 33%
|
|========================= | 35%
|
|========================== | 37%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 42%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 49%
|
|==================================== | 51%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 58%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================= | 65%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|====================================================== | 77%
|
|======================================================= | 79%
|
|======================================================== | 81%
|
|========================================================== | 82%
|
|=========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================= | 88%
|
|=============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================= | 93%
|
|================================================================== | 95%
|
|==================================================================== | 96%
|
|===================================================================== | 98%
|
|======================================================================| 100%
##
## [1] "Normalizing..."
george.ne50 = as.data.frame(t(george.ne50))
george.ne50$ne50_combined = george.ne50$ne50_UP-george.ne50$ne50_DN
metadata_george_filter$NE50.score = unlist(george.ne50[match(metadata_george_filter$`Sample-ID`,rownames(george.ne50)),]$ne50_combined)
metadata_george_filter$NE50.subtype = ifelse(metadata_george_filter$NE50.score > 0,"NE","Non-NE")
table(metadata_george_filter$NE50.subtype)
##
## NE Non-NE
## 42 15
Compare to NE50 score from Lissa et al Figure S7b source data
subtype.figs7b = read.xlsx("metadata/41467_2022_29517_MOESM9_ESM.xlsx",sheet="Fig S7b")
subtype.figs7b = merge(subtype.figs7b,metadata_george_filter[,c("Sample-ID","NE50.score")],by.x="Sample_id",by.y="Sample-ID")
ggplot(subtype.figs7b,aes(x=NE_Score,y=NE50.score)) + geom_point() + xlab("Lissa NE50 score") + ylab("NE50 score") +
geom_vline(xintercept=0,linetype="dashed") +
geom_hline(yintercept=0,linetype="dashed")
Conclusion: There is some variability between the earlier NE50 score calculation and that calculated here for the George cohort.
lissa.ne50 <- gsva(as.matrix(lissa.tpm), ne50, method="ssgsea")
## Warning: Calling gsva(expr=., gset.idx.list=., method=., ...) is deprecated;
## use a method-specific parameter object (see '?gsva').
## Warning in .filterFeatures(expr, method): 9030 genes with constant expression
## values throughout the samples.
## Estimating ssGSEA scores for 2 gene sets.
## [1] "Calculating ranks..."
## [1] "Calculating absolute values from ranks..."
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
##
## [1] "Normalizing..."
lissa.ne50 = as.data.frame(t(lissa.ne50))
lissa.ne50$ne50_combined = lissa.ne50$ne50_UP-lissa.ne50$ne50_DN
metadata_lissa$NE50.score = unlist(lissa.ne50[match(metadata_lissa$NCI_sample_ID.RNA,rownames(lissa.ne50)),]$ne50_combined)
metadata_lissa$NE50.subtype = ifelse(metadata_lissa$NE50.score > 0,"NE","Non-NE")
table(metadata_lissa$Disease,metadata_lissa$NE50.subtype)
##
## NE Non-NE
## EP_bladder 0 1
## EP_cervix 1 2
## EP_EGFRmt_trSCLC 2 2
## EP_ovary 0 1
## EP_prostate 1 0
## EP_rectum 0 1
## EP_subglottis 0 1
## SCLC 52 36
Compare to NE50 subtype from Lissa et al Figure 2f source data
subtype.fig2f = read.xlsx("metadata/41467_2022_29517_MOESM9_ESM.xlsx",sheet="Fig 2f")
subtype.fig2f = merge(subtype.fig2f,metadata_lissa[,c("dbGap sample ID","NE50.subtype")],by.x="Sample_id",by.y="dbGap sample ID")
table(subtype.fig2f$Subtype,subtype.fig2f$NE50.subtype)
##
## NE Non-NE
## NE 39 11
## Non NE 5 17
Conclusion: There is some variability between the earlier NE50 score calculation and that calculated here for the Lissa cohort.
# Score vs score
ggplot(metadata_george_filter,aes(x=CENPA.tpm,y=NE50.score)) + geom_point(aes(color=CENPA.category,shape=NE50.subtype)) +
geom_vline(xintercept=median(metadata_george_filter$CENPA.tpm),linetype="dashed") +
geom_hline(yintercept=0,linetype="dashed") +
ggtitle("George, CENPA vs. NE50") + xlab("CENPA expression (log2TPM)") + ylab("NE50 score") +
scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="NE50 subtype") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_george_filter$CENPA.tpm,metadata_george_filter$NE50.score,method="spearman")
##
## Spearman's rank correlation rho
##
## data: metadata_george_filter$CENPA.tpm and metadata_george_filter$NE50.score
## S = 15624, p-value = 0.0001171
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4936479
# Score vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,y=CENPA.tpm)) + geom_boxplot() +
ggtitle("George, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("CENPA expression (log2TPM)")
wilcox.test(metadata_george_filter$CENPA.tpm~metadata_george_filter$NE50.subtype)
##
## Wilcoxon rank sum exact test
##
## data: metadata_george_filter$CENPA.tpm by metadata_george_filter$NE50.subtype
## W = 490, p-value = 0.001125
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
ggtitle("George, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") +
scale_fill_discrete("CENPA category")
table(metadata_george_filter$CENPA.category,metadata_george_filter$NE50.subtype)
##
## NE Non-NE
## High 25 3
## Low 17 12
chisq.test(metadata_george_filter$CENPA.category,metadata_george_filter$NE50.subtype)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: metadata_george_filter$CENPA.category and metadata_george_filter$NE50.subtype
## X-squared = 5.4175, df = 1, p-value = 0.01994
Conclusion: In the George cohort, the NE score moderately correlates with CENPA expression, and NE samples have a higher CENPA expression.
# Score vs score
ggplot(metadata_lissa,aes(x=CENPA.tpm,y=NE50.score)) + geom_point(aes(color=CENPA.category,shape=NE50.subtype)) +
geom_vline(xintercept=median(metadata_lissa$CENPA.tpm),linetype="dashed") +
geom_hline(yintercept=0,linetype="dashed") +
ggtitle("Lissa, CENPA vs. NE50") + xlab("CENPA expression (log2TPM)") + ylab("NE50 score") +
scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="NE50 subtype") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_lissa$CENPA.tpm,metadata_lissa$NE50.score,method="spearman")
## Warning in cor.test.default(metadata_lissa$CENPA.tpm,
## metadata_lissa$NE50.score, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: metadata_lissa$CENPA.tpm and metadata_lissa$NE50.score
## S = 109483, p-value = 0.000476
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3430373
# Score vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,y=CENPA.tpm)) + geom_boxplot() +
ggtitle("Lissa, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("CENPA expression (log2TPM)")
wilcox.test(metadata_lissa$CENPA.tpm~metadata_lissa$NE50.subtype)
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CENPA.tpm by metadata_lissa$NE50.subtype
## W = 1662, p-value = 0.002859
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
ggtitle("Lissa, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") +
scale_fill_discrete("CENPA category")
table(metadata_lissa$CENPA.category,metadata_lissa$NE50.subtype)
##
## NE Non-NE
## High 32 18
## Low 24 26
chisq.test(metadata_lissa$CENPA.category,metadata_lissa$NE50.subtype)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: metadata_lissa$CENPA.category and metadata_lissa$NE50.subtype
## X-squared = 1.9886, df = 1, p-value = 0.1585
Conclusion: There is a weaker correlation between NE score and CENPA expression in the Lissa cohort.
# Score vs score
ggplot(metadata_george_filter,aes(x=CES.tpm,y=NE50.score)) + geom_point(aes(color=CES.category,shape=NE50.subtype)) +
geom_vline(xintercept=median(metadata_george_filter$CES.tpm),linetype="dashed") +
geom_hline(yintercept=0,linetype="dashed") +
ggtitle("George, CES vs. NE50") + xlab("CES expression (log2TPM)") + ylab("NE50 score") +
scale_color_discrete(name="CES category") + scale_shape_discrete(name="NE50 subtype") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_george_filter$CES.tpm,metadata_george_filter$NE50.score,method="spearman")
##
## Spearman's rank correlation rho
##
## data: metadata_george_filter$CES.tpm and metadata_george_filter$NE50.score
## S = 11198, p-value = 2.24e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.6370884
# Score vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,y=CES.tpm)) + geom_boxplot() +
ggtitle("George, CES vs. NE50") + xlab("NE50 subtype") + ylab("CES expression (log2TPM)")
wilcox.test(metadata_george_filter$CES.tpm~metadata_george_filter$NE50.subtype)
##
## Wilcoxon rank sum exact test
##
## data: metadata_george_filter$CES.tpm by metadata_george_filter$NE50.subtype
## W = 521, p-value = 9.195e-05
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,fill=CES.category)) + geom_bar(position="fill") +
ggtitle("George, CES vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") +
scale_fill_discrete("CES category")
table(metadata_george_filter$CES.category,metadata_george_filter$NE50.subtype)
##
## NE Non-NE
## High 26 2
## Low 16 13
chisq.test(metadata_george_filter$CES.category,metadata_george_filter$NE50.subtype)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: metadata_george_filter$CES.category and metadata_george_filter$NE50.subtype
## X-squared = 8.5803, df = 1, p-value = 0.003398
# Score vs score
ggplot(metadata_lissa,aes(x=CES.tpm,y=NE50.score)) + geom_point(aes(color=CES.category,shape=NE50.subtype)) +
geom_vline(xintercept=median(metadata_lissa$CES.tpm),linetype="dashed") +
geom_hline(yintercept=0,linetype="dashed") +
ggtitle("Lissa, CES vs. NE50") + xlab("CES expression (log2TPM)") + ylab("NE50 score") +
scale_color_discrete(name="CES category") + scale_shape_discrete(name="NE50 subtype") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_lissa$CES.tpm,metadata_lissa$NE50.score,method="spearman")
##
## Spearman's rank correlation rho
##
## data: metadata_lissa$CES.tpm and metadata_lissa$NE50.score
## S = 101464, p-value = 6.545e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3911551
# Score vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,y=CES.tpm)) + geom_boxplot() +
ggtitle("Lissa, CES vs. NE50") + xlab("NE50 subtype") + ylab("CES expression (log2TPM)")
wilcox.test(metadata_lissa$CES.tpm~metadata_lissa$NE50.subtype)
##
## Wilcoxon rank sum test with continuity correction
##
## data: metadata_lissa$CES.tpm by metadata_lissa$NE50.subtype
## W = 1730, p-value = 0.000551
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,fill=CES.category)) + geom_bar(position="fill") +
ggtitle("Lissa, CES vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") +
scale_fill_discrete("CES category")
table(metadata_lissa$CES.category,metadata_lissa$NE50.subtype)
##
## NE Non-NE
## High 35 15
## Low 21 29
chisq.test(metadata_lissa$CES.category,metadata_lissa$NE50.subtype)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: metadata_lissa$CES.category and metadata_lissa$NE50.subtype
## X-squared = 6.8588, df = 1, p-value = 0.008821
Conclusion: There is also a moderate correlation between NE score and CES score for both cohorts.
subtype_tf = c("ASCL1","NEUROD1","POU2F3","YAP1")
Assign SCNC subtype based on the transcription factor with the highest normalized expression within each sample.
george.tf = george.tpm[genes[match(subtype_tf,genes$GeneName),]$gene_id,]
rownames(george.tf) = subtype_tf
george.tf = apply(george.tf,2,function(x) (x-mean(x))/sd(x))
george.clust = hclust(dist(t(george.tf)))
Heatmap(matrix=george.tf,
cluster_columns = george.clust,
cluster_rows = FALSE,
show_column_names = TRUE,
show_row_dend = FALSE,
heatmap_legend_param = list(
title = "Expression\nZ-score",
labels_gp = gpar(fontsize = 12),
title_gp = gpar(fontsize = 12)
),
column_dend_height = unit(1,"inches")
)
# Assign to greatest Z-score
george.subtype = data.frame(Sample=george.clust$labels[george.clust$order],
Cluster=c(rep("SCNC-Y",2),rep("SCNC-P",8),rep("SCNC-A",41),rep("SCNC-P",1),rep("SCNC-N",5)))
george.subtype$Subtype = apply(george.tf,2,function(x) row.names(george.tf[which.max(x),,drop=FALSE]))[george.subtype$Sample]
#table(george.subtype$Cluster,george.subtype$Subtype)
# Add to metadata
metadata_george_filter$SCNC.subtype = george.subtype[match(metadata_george_filter$`Sample-ID`,
george.subtype$Sample),]$Subtype
metadata_george_filter$SCNC.subtype = mapvalues(metadata_george_filter$SCNC.subtype,subtype_tf,c("SCNC-A","SCNC-N","SCNC-P","SCNC-Y"))
table(metadata_george_filter$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## 41 5 9 2
Compare to SCNC assignments from Rudin et al
subtype.rudin = read.xlsx("metadata/41568_2019_133_MOESM2_ESM.xlsx")
subtype.rudin = subtype.rudin[which(subtype.rudin$Source == "George (2015)"),]
subtype.rudin$Name = gsub("T$","",subtype.rudin$Name)
subtype.rudin = merge(subtype.rudin,george.subtype,by.x="Name",by.y="Sample")
table(subtype.rudin$Subtype.assignment,subtype.rudin$Subtype)
##
## ASCL1 NEUROD1 POU2F3 YAP1
## SCLC-A 37 0 0 0
## SCLC-N 0 5 0 0
## SCLC-P 0 0 7 0
## SCLC-Y 0 0 0 2
Conclusion: The George SCNC subtypes completely correspond with subtypes assigned in Rudin et al.
lissa.tf = lissa.tpm[genes[match(subtype_tf,genes$GeneName),]$gene_id,]
rownames(lissa.tf) = subtype_tf
lissa.tf = apply(lissa.tf,2,function(x) (x-mean(x))/sd(x))
# Test with data from paper
#lissa.tf = subtype.fig2f
#rownames(lissa.tf) = subtype.fig2f$Sample_id
#lissa.tf = apply(t(lissa.tf[,subtype_tf]),2,function(x) (x-mean(x))/sd(x))
lissa.clust = hclust(dist(t(lissa.tf)))
Heatmap(matrix=lissa.tf,
cluster_columns = lissa.clust,
cluster_rows = FALSE,
show_column_names = TRUE,
show_row_dend = FALSE,
heatmap_legend_param = list(
title = "Expression\nZ-score",
labels_gp = gpar(fontsize = 12),
title_gp = gpar(fontsize = 12)
),
column_dend_height = unit(1,"inches")
)
# Assign to TF with greatest expression
lissa.subtype = data.frame(Sample=lissa.clust$labels[lissa.clust$order],
Cluster=c(rep("SCNC-Y",28),rep("SCNC-A",12),rep("SCNC-N",16),rep("SCNC-P",2),rep("SCNC-Y",19),rep("SCNC-A",23)))
lissa.subtype$Subtype = apply(lissa.tf,2,function(x) row.names(lissa.tf[which.max(x),,drop=FALSE]))[lissa.subtype$Sample]
#table(lissa.subtype$Cluster,lissa.subtype$Subtype)
# Add to metadata
metadata_lissa$SCNC.subtype = lissa.subtype[match(metadata_lissa$NCI_sample_ID.RNA,
lissa.subtype$Sample),]$Subtype
metadata_lissa$SCNC.subtype = mapvalues(metadata_lissa$SCNC.subtype,subtype_tf,c("SCNC-A","SCNC-N","SCNC-P","SCNC-Y"))
rm(lissa.subtype)
table(metadata_lissa$Disease,metadata_lissa$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## EP_bladder 0 1 0 0
## EP_cervix 0 1 0 2
## EP_EGFRmt_trSCLC 0 1 0 3
## EP_ovary 0 0 0 1
## EP_prostate 1 0 0 0
## EP_rectum 1 0 0 0
## EP_subglottis 0 0 0 1
## SCLC 36 15 2 35
Compare to SCNC assignments from Lissa et al Figure 2h source data
subtype.fig2h = read.xlsx("metadata/41467_2022_29517_MOESM9_ESM.xlsx",sheet="Fig 2h")
subtype.fig2h = merge(subtype.fig2h,metadata_lissa[,c("dbGap sample ID","SCNC.subtype")],
by.x="Sample_id",by.y="dbGap sample ID")
table(subtype.fig2h$Subtype,subtype.fig2h$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## SCNC-A 23 0 0 7
## SCNC-N 3 14 0 3
## SCNC-P 0 0 1 0
## SCNC-Y 2 1 0 18
Conclusion: Hierarchical clustering leads to less clear subtype delineations for the Lissa cohort, and several samples are differently classified compared to Lissa et al.
# Score vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,y=CENPA.tpm)) + geom_boxplot() +
ggtitle("George, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("CENPA expression (log2TPM)")
pairwise.wilcox.test(metadata_george_filter$CENPA.tpm,metadata_george_filter$SCNC.subtype)
##
## Pairwise comparisons using Wilcoxon rank sum exact test
##
## data: metadata_george_filter$CENPA.tpm and metadata_george_filter$SCNC.subtype
##
## SCNC-A SCNC-N SCNC-P
## SCNC-N 1.00 - -
## SCNC-P 0.25 0.25 -
## SCNC-Y 1.00 0.76 1.00
##
## P value adjustment method: holm
# Category vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
ggtitle("George, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") +
scale_fill_discrete("CENPA category")
table(metadata_george_filter$CENPA.category,metadata_george_filter$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## High 21 4 2 1
## Low 20 1 7 1
chisq.test(metadata_george_filter$CENPA.category,metadata_george_filter$SCNC.subtype)
## Warning in chisq.test(metadata_george_filter$CENPA.category,
## metadata_george_filter$SCNC.subtype): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: metadata_george_filter$CENPA.category and metadata_george_filter$SCNC.subtype
## X-squared = 4.586, df = 3, p-value = 0.2047
Conclusion: CENPA expression is lower in the SCNC-Y subtype, but not significantly so.
# Score vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,y=CENPA.tpm)) + geom_boxplot() +
ggtitle("Lissa, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("CENPA expression (log2TPM)")
pairwise.wilcox.test(metadata_lissa$CENPA.tpm,metadata_lissa$SCNC.subtype)
## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties
##
## Pairwise comparisons using Wilcoxon rank sum exact test
##
## data: metadata_lissa$CENPA.tpm and metadata_lissa$SCNC.subtype
##
## SCNC-A SCNC-N SCNC-P
## SCNC-N 0.974 - -
## SCNC-P 1.000 1.000 -
## SCNC-Y 0.006 0.236 0.761
##
## P value adjustment method: holm
# Category vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
ggtitle("Lissa, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") +
scale_fill_discrete("CENPA category")
table(metadata_lissa$CENPA.category,metadata_lissa$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## High 25 10 2 13
## Low 13 8 0 29
chisq.test(metadata_lissa$CENPA.category,metadata_lissa$SCNC.subtype)
## Warning in chisq.test(metadata_lissa$CENPA.category,
## metadata_lissa$SCNC.subtype): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: metadata_lissa$CENPA.category and metadata_lissa$SCNC.subtype
## X-squared = 12.107, df = 3, p-value = 0.007026
Conclusion: CENPA category differs across the SCNC subtypes, with the lowest expression in SCNC-Y.
# Score vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,y=CES.tpm)) + geom_boxplot() +
ggtitle("George, CES vs. SCNC") + xlab("SCNC subtype") + ylab("CES expression (log2TPM)")
pairwise.wilcox.test(metadata_george_filter$CES.tpm,metadata_george_filter$SCNC.subtype)
##
## Pairwise comparisons using Wilcoxon rank sum exact test
##
## data: metadata_george_filter$CES.tpm and metadata_george_filter$SCNC.subtype
##
## SCNC-A SCNC-N SCNC-P
## SCNC-N 0.073 - -
## SCNC-P 0.129 0.012 -
## SCNC-Y 0.749 0.571 0.909
##
## P value adjustment method: holm
# Category vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,fill=CES.category)) + geom_bar(position="fill") +
ggtitle("George, CES vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") +
scale_fill_discrete("CES category")
table(metadata_george_filter$CES.category,metadata_george_filter$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## High 22 4 1 1
## Low 19 1 8 1
chisq.test(metadata_george_filter$CES.category,metadata_george_filter$SCNC.subtype)
## Warning in chisq.test(metadata_george_filter$CES.category,
## metadata_george_filter$SCNC.subtype): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: metadata_george_filter$CES.category and metadata_george_filter$SCNC.subtype
## X-squared = 7.4487, df = 3, p-value = 0.05889
Conclusion: The CES score is higher in the SCNC-N subtype vs. the SCNC-P subtype.
# Score vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,y=CES.tpm)) + geom_boxplot() +
ggtitle("Lissa, CES vs. SCNC") + xlab("SCNC subtype") + ylab("CES expression (log2TPM)")
pairwise.wilcox.test(metadata_lissa$CES.tpm,metadata_lissa$SCNC.subtype)
##
## Pairwise comparisons using Wilcoxon rank sum exact test
##
## data: metadata_lissa$CES.tpm and metadata_lissa$SCNC.subtype
##
## SCNC-A SCNC-N SCNC-P
## SCNC-N 1.0000 - -
## SCNC-P 1.0000 1.0000 -
## SCNC-Y 0.0021 0.2283 0.4736
##
## P value adjustment method: holm
# Category vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,fill=CES.category)) + geom_bar(position="fill") +
ggtitle("Lissa, CES vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") +
scale_fill_discrete("CES category")
table(metadata_lissa$CES.category,metadata_lissa$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## High 24 9 2 15
## Low 14 9 0 27
chisq.test(metadata_lissa$CES.category,metadata_lissa$SCNC.subtype)
## Warning in chisq.test(metadata_lissa$CES.category,
## metadata_lissa$SCNC.subtype): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: metadata_lissa$CES.category and metadata_lissa$SCNC.subtype
## X-squared = 8.0602, df = 3, p-value = 0.04478
Conclusion: The CES score is lower in the SCNC-Y subtype than the the SCNC-A subtype.
table(metadata_george_filter$NE50.subtype,metadata_george_filter$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## NE 36 5 1 0
## Non-NE 5 0 8 2
chisq.test(metadata_george_filter$NE50.subtype,metadata_george_filter$SCNC.subtype)
## Warning in chisq.test(metadata_george_filter$NE50.subtype,
## metadata_george_filter$SCNC.subtype): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: metadata_george_filter$NE50.subtype and metadata_george_filter$SCNC.subtype
## X-squared = 29.775, df = 3, p-value = 1.539e-06
table(metadata_lissa$NE50.subtype,metadata_lissa$SCNC.subtype)
##
## SCNC-A SCNC-N SCNC-P SCNC-Y
## NE 32 14 0 10
## Non-NE 6 4 2 32
chisq.test(metadata_lissa$NE50.subtype,metadata_lissa$SCNC.subtype)
## Warning in chisq.test(metadata_lissa$NE50.subtype,
## metadata_lissa$SCNC.subtype): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: metadata_lissa$NE50.subtype and metadata_lissa$SCNC.subtype
## X-squared = 35.946, df = 3, p-value = 7.686e-08
Conclusion: The NE subtype is significantly associated with the SCNC subtype, as expected.
Cohort: Lissa et al (relapse/metastasis), 29 WXS tumor/normal pairs available via the Oncogenomics portal
Method:
From Oncogenomics portal (calculated from Sequenza):
From FACETS summary table:
From FACETS QC output:
From FACETS chromosome arm-level summary:
ABSOLUTE (Carter 2012 Nature Computational Biology)
Original manuscript identified genome doubling via ploidy inflection points per cancer type, which corresponds to a transition in major copy number (MCN). MCN will be primarily even if doubling has occurred. Calculated for Sequenza
Major copy number (Bielski 2018 Nat Genetics)
WGD is called if >50% of autosomal genome had MCN >= 2. Calculated for Sequenza
CIN metrics (Oza 2023 PeerJ)
Algorithms independently implemented (R package takes only array-based segment data from TCGA). Calculated for both Sequenza and FACETS. Except where noted, considers only segments with absolute log2 copy number ratio >= 0.2
aneuploidy.files = list.files("wgs/Lissa/aneuploidy",pattern="anueploidy_metrics.txt",full.names = TRUE)
aneuploidy = lapply(aneuploidy.files,function(x) read.table(x,header=TRUE,sep = '\t'))
names(aneuploidy) = gsub(".anueploidy_metrics.txt","",gsub("wgs/Lissa/aneuploidy/","",aneuploidy.files))
aneuploidy = ldply(aneuploidy,.id="Sample")
aneuploidy$wgd.mcn = ifelse(aneuploidy$wgd.mcn == "Yes",TRUE,FALSE)
aneuploidy$wgd.absolute = ifelse(aneuploidy$wgd.absolute == "Yes",TRUE,FALSE)
aneuploidy$facets.arm = ifelse(aneuploidy$facets.arm == "Yes",TRUE,FALSE)
#Add aneuploidy metric from Oncogenomics portal
frac_diploid = read.table("wgs/Lissa/23461.sequenza.summary.tsv",header=TRUE,sep='\t')
colnames(frac_diploid) = mapvalues(colnames(frac_diploid),c("Ratio"),c("Non.diploid.Ratio"))
aneuploidy = merge(aneuploidy,frac_diploid[,c("Sample.ID","Sample.Name","Case.ID","Non.diploid.Length","Non.diploid.Ratio","Total.segments")],
by.x="Sample",
by.y="Sample.ID",
all.x=TRUE)
rm(frac_diploid)
aneuploidy.continuous = c("facets.fraction_loh","facets.fga","facets.n_segs","wgd.cnmode","sequenza.CIN.TAI","sequenza.CIN.modifiedTAI","sequenza.CIN.CNA","sequenza.CIN.breakpoints","sequenza.CIN.bases","sequenza.CIN.FGA","facets.CIN.TAI","facets.CIN.modifiedTAI","facets.CIN.CNA","facets.CIN.breakpoints","facets.CIN.bases","facets.CIN.FGA","Non.diploid.Length","Non.diploid.Ratio","Total.segments")
aneuploidy.discrete = c("facets.hypoploid","facets.arm","facets.wgd","wgd.mcn","wgd.absolute")
ggplot(aneuploidy,aes(x=sequenza.cellularity,y=facets.purity)) + geom_point() +
xlab("Sequenza cellularity") + ylab("FACETS purity") +
xlim(0,1) + ylim(0,1) +
geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
cor.test(aneuploidy$sequenza.cellularity,aneuploidy$facets.purity)
##
## Pearson's product-moment correlation
##
## data: aneuploidy$sequenza.cellularity and aneuploidy$facets.purity
## t = 4.6225, df = 26, p-value = 9.105e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3983867 0.8353896
## sample estimates:
## cor
## 0.6716387
ggplot(aneuploidy,aes(x=sequenza.ploidy,y=facets.ploidy)) + geom_point() +
xlab("Sequenza ploidy") + ylab("FACETS ploidy") +
xlim(1,7) + ylim(1,7) +
geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'
cor.test(aneuploidy$sequenza.ploidy,aneuploidy$facets.ploidy)
##
## Pearson's product-moment correlation
##
## data: aneuploidy$sequenza.ploidy and aneuploidy$facets.ploidy
## t = 4.8537, df = 27, p-value = 4.512e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4215869 0.8391761
## sample estimates:
## cor
## 0.6826185
Conclusion: Ploidy and purity/cellularity calls are moderately correlated between FACETS and Sequenza.
The ABSOLUTE paper (Carter 2012) identified WGDs based on the ploidy inflection point for each cancer (Supplementary Figure 9).
ggplot(aneuploidy,aes(x=rank(facets.ploidy,ties="random"),y=facets.ploidy,color=facets.wgd)) + geom_point() +
xlab("Ploidy rank") + ylab("FACETS ploidy") + scale_color_discrete(name="FACETS WGD")
Conclusion: Two ploidy inflection points are visible for SCNC, potentially corresponding to single and multiple WGD events, and the first inflection point agrees with WGD as called by FACETS.
Continuous metrics: Pearson correlation
aneuploidy.cor = cor(aneuploidy[,aneuploidy.continuous],use="pairwise.complete.obs")
Heatmap(aneuploidy.cor,col=colorRamp2(c(-1, 0, 1), c("blue", "white", "red")))
Discrete metrics: % concordant
apply(aneuploidy[,aneuploidy.discrete],2,function(x)
apply(aneuploidy[,aneuploidy.discrete],2,function(y)
sum(x == y)/length(x)))
## facets.hypoploid facets.arm facets.wgd wgd.mcn wgd.absolute
## facets.hypoploid 1.00000000 0.03448276 0.2413793 0.3103448 0.3793103
## facets.arm 0.03448276 1.00000000 0.7241379 0.7241379 0.6551724
## facets.wgd 0.24137931 0.72413793 1.0000000 0.8620690 0.8620690
## wgd.mcn 0.31034483 0.72413793 0.8620690 1.0000000 0.9310345
## wgd.absolute 0.37931034 0.65517241 0.8620690 0.9310345 1.0000000
Conclusion: There is high correlation between the aneuploidy metrics, which divide roughly into metrics based on the length of the altered genome and metrics based on the number of altered segments. FACETS and Sequenza generally agree.
For each aneuploidy metric, test for significant association with CENP-A expression and CES score. Sorted p-values are printed for all metrics (no multiple hypothesis correction) and significant associations are plotted.
metadata_lissa_aneuploidy = merge(metadata_lissa,
aneuploidy,
by.x="NCI_sample_ID.tumor",
by.y="Sample.Name")
# CENP-A TPM vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
as.numeric(unlist(cor.test(metadata_lissa_aneuploidy$CENPA.tpm,x,method="spearman"))["p.value"])))
## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## facets.CIN.TAI sequenza.CIN.TAI sequenza.CIN.breakpoints
## 0.0377728 0.1114889 0.2249041
## Total.segments facets.fga facets.CIN.breakpoints
## 0.3505371 0.4518444 0.4826533
## facets.n_segs Non.diploid.Ratio facets.fraction_loh
## 0.5046517 0.5711288 0.6091188
## facets.CIN.modifiedTAI sequenza.CIN.modifiedTAI facets.CIN.CNA
## 0.6261078 0.6352256 0.7252966
## sequenza.CIN.CNA facets.CIN.FGA Non.diploid.Length
## 0.7445796 0.7601180 0.7993599
## facets.CIN.bases wgd.cnmode sequenza.CIN.bases
## 0.8530996 0.8803755 0.9745376
## sequenza.CIN.FGA
## 1.0000000
# CENP-A TPM vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
as.numeric(unlist(wilcox.test(metadata_lissa_aneuploidy$CENPA.tpm~x))["p.value"])))
## facets.hypoploid wgd.mcn wgd.absolute facets.wgd
## 0.06896552 0.42857126 0.42866858 0.64949809
ggplot(metadata_lissa_aneuploidy,aes(x=CENPA.tpm,y=facets.CIN.TAI)) + geom_point(aes(color=CENPA.category)) +
geom_vline(xintercept=median(metadata_lissa_aneuploidy$CENPA.tpm),linetype="dashed") +
geom_hline(yintercept=median(metadata_lissa_aneuploidy$facets.CIN.TAI),linetype="dashed") +
ggtitle("Lissa, CENPA vs. FACETS TAI") + xlab("CENPA expression (log2TPM)") + ylab("FACETS TAI") +
scale_color_discrete(name="CENPA category") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_lissa_aneuploidy$CENPA.tpm,metadata_lissa_aneuploidy$facets.CIN.TAI,method="spearman")
##
## Spearman's rank correlation rho
##
## data: metadata_lissa_aneuploidy$CENPA.tpm and metadata_lissa_aneuploidy$facets.CIN.TAI
## S = 2480, p-value = 0.03777
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3891626
# CENP-A category vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
as.numeric(unlist(wilcox.test(x~metadata_lissa_aneuploidy$CENPA.category))["p.value"])))
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## facets.CIN.TAI sequenza.CIN.TAI sequenza.CIN.breakpoints
## 0.02915903 0.11206979 0.14561932
## Total.segments facets.n_segs facets.CIN.breakpoints
## 0.19032105 0.27029878 0.27517440
## facets.CIN.CNA sequenza.CIN.CNA facets.CIN.modifiedTAI
## 0.40049439 0.50453927 0.50453927
## sequenza.CIN.FGA sequenza.CIN.bases facets.fga
## 0.68295388 0.74719240 0.75992421
## facets.fraction_loh Non.diploid.Ratio wgd.cnmode
## 0.79330382 0.80632816 0.87029182
## facets.CIN.bases Non.diploid.Length facets.CIN.FGA
## 0.88047764 0.91447537 0.94862324
## sequenza.CIN.modifiedTAI
## 0.98286486
# CENP-A category vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
as.numeric(unlist(chisq.test(x,metadata_lissa_aneuploidy$CENPA.category))["p.value"])))
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect
## facets.wgd wgd.mcn wgd.absolute facets.hypoploid
## 0.7633841 0.7633841 0.7978615 0.9719888
ggplot(metadata_lissa_aneuploidy,aes(x=CENPA.category,y=facets.CIN.TAI)) + geom_boxplot() +
ggtitle("Lissa, CENPA vs. FACETS TAI") + xlab("CENPA category") + ylab("FACETS TAI")
# CES TPM vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
as.numeric(unlist(cor.test(metadata_lissa_aneuploidy$CES.tpm,x,method="spearman"))["p.value"])))
## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
## facets.CIN.TAI sequenza.CIN.breakpoints Non.diploid.Ratio
## 0.004690237 0.088171029 0.144648162
## facets.CIN.modifiedTAI Total.segments facets.fga
## 0.153257988 0.172883176 0.175850910
## facets.CIN.breakpoints Non.diploid.Length wgd.cnmode
## 0.218774978 0.222898355 0.322606685
## sequenza.CIN.TAI sequenza.CIN.modifiedTAI facets.n_segs
## 0.326808599 0.335880928 0.346437471
## facets.CIN.FGA facets.CIN.CNA facets.CIN.bases
## 0.415419836 0.448751753 0.489950140
## sequenza.CIN.CNA sequenza.CIN.bases sequenza.CIN.FGA
## 0.576085774 0.685353870 0.692901171
## facets.fraction_loh
## 0.935274847
# CES TPM vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
as.numeric(unlist(wilcox.test(metadata_lissa_aneuploidy$CES.tpm~x))["p.value"])))
## facets.hypoploid wgd.absolute facets.wgd wgd.mcn
## 0.1379310 0.1506983 0.2785190 0.3241689
ggplot(metadata_lissa_aneuploidy,aes(x=CES.tpm,y=facets.CIN.TAI)) + geom_point(aes(color=CES.category)) +
geom_vline(xintercept=median(metadata_lissa_aneuploidy$CES.tpm),linetype="dashed") +
geom_hline(yintercept=median(metadata_lissa_aneuploidy$facets.CIN.TAI),linetype="dashed") +
ggtitle("Lissa, CES vs. FACETS TAI") + xlab("CES expression (log2TPM)") + ylab("FACETS TAI") +
scale_color_discrete(name="CES category") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(metadata_lissa_aneuploidy$CES.tpm,metadata_lissa_aneuploidy$facets.CIN.TAI,method="spearman")
##
## Spearman's rank correlation rho
##
## data: metadata_lissa_aneuploidy$CES.tpm and metadata_lissa_aneuploidy$facets.CIN.TAI
## S = 1966, p-value = 0.00469
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5157635
# CES category vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
as.numeric(unlist(wilcox.test(x~metadata_lissa_aneuploidy$CES.category))["p.value"])))
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## facets.CIN.TAI facets.fga sequenza.CIN.breakpoints
## 0.001081665 0.056839741 0.072543601
## facets.CIN.modifiedTAI Total.segments facets.CIN.breakpoints
## 0.072543601 0.080201111 0.143893481
## Non.diploid.Ratio facets.n_segs sequenza.CIN.modifiedTAI
## 0.174847813 0.244925889 0.282897456
## Non.diploid.Length sequenza.CIN.CNA wgd.cnmode
## 0.303256893 0.346724806 0.368460543
## sequenza.CIN.TAI facets.CIN.FGA facets.CIN.CNA
## 0.418719925 0.418719925 0.471097052
## facets.CIN.bases sequenza.CIN.bases sequenza.CIN.FGA
## 0.498539741 0.647057192 0.647057192
## facets.fraction_loh
## 0.739654986
# CES category vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
as.numeric(unlist(chisq.test(x,metadata_lissa_aneuploidy$CES.category))["p.value"])))
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect
## wgd.absolute facets.wgd wgd.mcn facets.hypoploid
## 0.2799424 0.3155851 0.3155851 1.0000000
ggplot(metadata_lissa_aneuploidy,aes(x=CES.category,y=facets.CIN.TAI)) + geom_boxplot() +
ggtitle("Lissa, CES vs. FACETS TAI") + xlab("CES category") + ylab("FACETS TAI")
Conclusion: The only aneuploidy metric significantly correlated with CENP-A expression or CES score is facets.CIN.TAI (FACETS-based Total Aberration Index). In general, aneuploidy does not appear to correlate with CENP-A expression.